{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模块导入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 762,
   "metadata": {},
   "outputs": [],
   "source": [
    "import base64\n",
    "import json\n",
    "import os\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import requests\n",
    "import requests_html\n",
    "import selenium\n",
    "import xlwt\n",
    "from lxml.html import fromstring\n",
    "from PIL import Image, ImageEnhance\n",
    "from requests_html import HTMLSession\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 登录知网"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 763,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-763-f52ab15e8050>:17: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "# Build the Chrome launch options and start the browser session used by every later cell.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the \"DevToolsActivePort file doesn't exist\" error\n",
    "opts.add_argument('--window-size=1920,3000')  # browser resolution; Chrome expects WIDTH,HEIGHT\n",
    "opts.add_argument('--disable-gpu')  # Google's docs mention this flag as a bug workaround\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "# Optional switches, left disabled on purpose:\n",
    "# opts.add_argument('blink-settings=imagesEnabled=false')  # skip image loading for speed\n",
    "# opts.add_argument('--headless')  # no visible window; on Linux without a display, startup fails without it\n",
    "\n",
    "# `options=` replaces the deprecated `chrome_options=` keyword.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 764,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CNKI home page in the browser session.\n",
    "CNKI_HOME_URL = \"https://www.cnki.net/\"\n",
    "driver.get(CNKI_HOME_URL)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 检查登录状态"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 765,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "''"
      ]
     },
     "execution_count": 765,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the login-name element to check the login status.\n",
    "# `find_element(By.ID, ...)` is used because `find_element_by_id` no longer exists in Selenium >= 4.3.\n",
    "login_name = driver.find_element(By.ID, 'Ecp_loginShowName1')\n",
    "login_name.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 进入高级检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 728,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "高级检索\n"
     ]
    }
   ],
   "source": [
    "# Advanced search: locate the link by id, print its label as a sanity check, then click it.\n",
    "element = driver.find_element(By.ID, 'highSearch')\n",
    "print(element.get_attribute('innerHTML'))\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 检查窗口位置信息\n",
    "只要存在多个窗口，就需要先查看窗口句柄，再切换到目标窗口。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 729,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-FE7C6BB896D959E412F7384BE2E7AD69'"
      ]
     },
     "execution_count": 729,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Handle of the window the driver is currently focused on.\n",
    "driver.current_window_handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 730,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-FE7C6BB896D959E412F7384BE2E7AD69',\n",
       " 'CDwindow-9986C2D8B52C65E0B8964A1EEF38790E']"
      ]
     },
     "execution_count": 730,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Handles of every window opened in this browser session.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 731,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-731-2c997ac77236>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Focus the second window handle; `switch_to.window` replaces the deprecated `switch_to_window`.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 732,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<span>学术期刊</span><em></em>\n"
     ]
    }
   ],
   "source": [
    "# Click the \"学术期刊\" (academic journals) entry of the document-type menu.\n",
    "# The link's inner HTML is printed first as a sanity check.\n",
    "element = driver.find_element(By.XPATH, '//ul[@class=\"doctype-menus keji\"]/li[@data-id=\"xsqk\"]/a')\n",
    "print(element.get_attribute('innerHTML'))\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点击专业检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 733,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "专业检索\n"
     ]
    }
   ],
   "source": [
    "# Click \"专业检索\" (professional search); its label is printed first as a sanity check.\n",
    "element = driver.find_element(By.NAME, 'majorSearch')\n",
    "print(element.get_attribute('innerHTML'))\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 选择期刊来源"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 734,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the \"SCI\" journal-source option.\n",
    "# NOTE(review): the absolute XPath depends on the current page layout and breaks if CNKI changes it.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[2]/input')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 735,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the \"EI\" journal-source option.\n",
    "# NOTE(review): the absolute XPath depends on the current page layout and breaks if CNKI changes it.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[3]/input')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 736,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the \"CSSCI\" journal-source option.\n",
    "# NOTE(review): the absolute XPath depends on the current page layout and breaks if CNKI changes it.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[5]/input')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设置搜索的query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 737,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Professional-search expression; a later cell types it into the search textarea.\n",
    "query = 'SU = \"融媒体\" '"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 738,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace whatever is in the search textarea with the query expression.\n",
    "element = driver.find_element(By.XPATH, '//textarea')\n",
    "element.clear()\n",
    "element.send_keys(query)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 搜索并查看页面信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 741,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Submit the query by clicking the input whose value is \"检索\".\n",
    "# The former `get_attribute('innerHTML')` call was dropped: its result was discarded.\n",
    "element = driver.find_element(By.XPATH, '//input[@value=\"检索\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 742,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "共找到<em>1,416</em>条结果\n"
     ]
    }
   ],
   "source": [
    "# Print the result-count banner (the span with class \"pagerTitleCell\").\n",
    "element = driver.find_element(By.XPATH, '//span[@class=\"pagerTitleCell\"]')\n",
    "print(element.get_attribute('innerHTML'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 746,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show 50 results per page, step 1: click the page-size dropdown control.\n",
    "# NOTE(review): the absolute XPath depends on the current page layout and breaks if CNKI changes it.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[2]/div/div/div/i')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 747,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the third entry of the dropdown list (presumably the 50-per-page option; TODO confirm).\n",
    "# NOTE(review): the absolute XPath depends on the current page layout and breaks if CNKI changes it.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[2]/div/div/ul/li[3]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 748,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>多元环境下学术期刊品牌建设的思考  网络首发</td>\n",
       "      <td>宋启凡</td>\n",
       "      <td>科技与出版</td>\n",
       "      <td>2021-06-29 17:21</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>国家治理视角下县级融媒体中心传播功能再解读  网络首发</td>\n",
       "      <td>倪琳</td>\n",
       "      <td>东岳论丛</td>\n",
       "      <td>2021-06-23 14:01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>76.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>国家治理视角下县级融媒体中心传播功能再解读</td>\n",
       "      <td>倪琳</td>\n",
       "      <td>东岳论丛</td>\n",
       "      <td>2021-06-23 14:01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>县级融媒体如何推动政治倾听与舆论引导——以河南省新冠肺炎疫情防控为例</td>\n",
       "      <td>杨逍; 王子丰</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-06-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>22.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>“使用与满足”的凸显——融媒体时代电视综艺节目发展的新思路</td>\n",
       "      <td>陈淼; 王云峰</td>\n",
       "      <td>中国电视</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>296.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>融媒体时代主题出版转型发展的探究  网络首发</td>\n",
       "      <td>区燕宜</td>\n",
       "      <td>科技与出版</td>\n",
       "      <td>2021-06-11 13:33</td>\n",
       "      <td>NaN</td>\n",
       "      <td>97.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>论融媒体素养对新文科人才培养的意义</td>\n",
       "      <td>白寅</td>\n",
       "      <td>中国编辑</td>\n",
       "      <td>2021-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>77.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>欠发达地区县级融媒体中心建设的难点与出路</td>\n",
       "      <td>李琴</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>28.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>大河网2021年全国两会报道创新策略研究</td>\n",
       "      <td>王亚楠</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>融媒时代中国故事的叙事与传播创新策略——以人民日报《中国24小时》系列微视频为例</td>\n",
       "      <td>刘珊</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>数据融通：县级融媒体中心的数据利用与功能发挥</td>\n",
       "      <td>沈晖</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>南京广电的融媒体创新与升级策略</td>\n",
       "      <td>秘春茜; 李宏刚</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>现象级融媒传播精品策划研究——以福州广播电视台为例</td>\n",
       "      <td>陈建斌</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>融媒体时代影视传播的形态及接受差异研究——评《影视传播学》</td>\n",
       "      <td>张建朔</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-06-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>84.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>论中国共产党新闻思想百年发展历史进程</td>\n",
       "      <td>郑保卫; 王青</td>\n",
       "      <td>社会科学战线</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>280.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>2020主题出版：政策引导、业界实践与理论研究</td>\n",
       "      <td>周蔚华; 何小凡</td>\n",
       "      <td>中国出版</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>47.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>理念·文本·叙事：主旋律纪录片融合创新的三重向度</td>\n",
       "      <td>李晨</td>\n",
       "      <td>当代电视</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>79.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>融合、话语与共同体精神：论战“疫”系列短视频的创作与传播路径</td>\n",
       "      <td>位俊达</td>\n",
       "      <td>当代电视</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>55.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>媒体融合背景下的非遗传播演变和挑战</td>\n",
       "      <td>杜晓晶; 徐佶</td>\n",
       "      <td>当代电视</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>58.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>党媒融媒体工作室的新闻生产与创新研究</td>\n",
       "      <td>左小麟; 郑伊健</td>\n",
       "      <td>出版广角</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>总结“十三五” 展望“十四五” 大力推动广电产业高质量发展</td>\n",
       "      <td>张先</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>106.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>阐释艺术内涵 展望产业未来——评《融媒时代我国数字影视传播与产业研究》</td>\n",
       "      <td>倪莉</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>75.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>融媒体时代文化传播创新路径——推荐《文化传播学》</td>\n",
       "      <td>田虹; 费艳颖</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>184.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>论报业融媒体经营中的社群私域流量建设——以《阳光少年报》社群运维为例</td>\n",
       "      <td>胡士才</td>\n",
       "      <td>新闻爱好者</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>融媒体时代下薪酬公平感对双元媒体人的影响——自我效能感的中介作用</td>\n",
       "      <td>吴珍妮; 娄世艳</td>\n",
       "      <td>重庆社会科学</td>\n",
       "      <td>2021-05-16</td>\n",
       "      <td>NaN</td>\n",
       "      <td>71.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>融媒体视阈下电视公共服务节目创新路径探究</td>\n",
       "      <td>马玥; 刘丽群</td>\n",
       "      <td>中国电视</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>119.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>融媒体背景下儿童绘本传播研究——以上海外语教育出版社“儿童英语阅读魔盒”为例</td>\n",
       "      <td>黄新炎; 李茜榕</td>\n",
       "      <td>出版广角</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>84.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>重大突发公共事件中基层媒体融合传播的创新与思考——以三家县级融媒体中心抗疫报道为例</td>\n",
       "      <td>陈月飞; 丁和根</td>\n",
       "      <td>当代传播</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>243.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>我国跨文化传播的困境与优化路径</td>\n",
       "      <td>段龙江</td>\n",
       "      <td>人民论坛</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>370.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>智媒时代县级融媒建设发展的制约瓶颈与应对策略</td>\n",
       "      <td>刘峰; 罗敦洲</td>\n",
       "      <td>出版发行研究</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>大数据时代主流媒体传播体系的融媒体化策略</td>\n",
       "      <td>张才明; 褚婉宏; 金韶</td>\n",
       "      <td>编辑学刊</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>28.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>融媒体语境下综合类期刊创新发展思考</td>\n",
       "      <td>李秀娟</td>\n",
       "      <td>编辑学刊</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>29.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>少儿传统文化图书创作理念及出版形式探析</td>\n",
       "      <td>韩青</td>\n",
       "      <td>编辑学刊</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>社会治理视阈下县级融媒体中心建设：功能定位与实践逻辑</td>\n",
       "      <td>李文冰; 吴莎琪</td>\n",
       "      <td>现代传播(中国传媒大学学报)</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>160.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>全媒体传播体系与四级融合新发展格局</td>\n",
       "      <td>胡正荣; 蒋东旭</td>\n",
       "      <td>中国编辑</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>241.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>2021年全国两会融媒体报道创新观察</td>\n",
       "      <td>王晓东</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>103.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>2021年全国两会总台融媒体报道特色</td>\n",
       "      <td>王莹; 覃垚; 戴冰</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>春天之约 英雄答卷——湖北广电全国两会报道纪实</td>\n",
       "      <td>郭小容</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>融媒传播中“媒资”价值变现路径初探——以丽水广电集团融媒体中心为例</td>\n",
       "      <td>董枫; 施龙有; 吴峰平</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>37.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>融媒体背景下内容产品的多元开发——以《笔尖上的诗词课》融合出版项目为例</td>\n",
       "      <td>鲁艳芳</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>浙江安吉新闻集团智慧化融合模式解析</td>\n",
       "      <td>乔秀峰</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>42.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>从市场竞合到纳入国家治理体系——中国媒介融合研究20年之语境变迁</td>\n",
       "      <td>栾轶玫</td>\n",
       "      <td>编辑之友</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>418.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>媒体融合的现状、难点与市场机制突破</td>\n",
       "      <td>陈国权</td>\n",
       "      <td>编辑之友</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>483.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>主流媒体平台建设的优势与短板——从三大央媒的平台实践看深化媒体融合</td>\n",
       "      <td>蔡雯; 汪惠怡</td>\n",
       "      <td>编辑之友</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>519.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>建强不易 用好更难</td>\n",
       "      <td>唐绪军</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>61.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>基层社会治理与县级融媒体中心建设</td>\n",
       "      <td>李蕾</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>306.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>媒体介入基层社会治理的现状、角色与维度</td>\n",
       "      <td>丁和根</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>373.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>参与式治理视角下县级融媒体的角色定位与发展路径</td>\n",
       "      <td>罗昕; 蔡雨婷</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>285.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>重建本地用户连接 融入基层社会治理：县级融媒体发展路径研究</td>\n",
       "      <td>曾润喜; 杨璨</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>343.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>融合人民：县级媒体融合与基层协同治理</td>\n",
       "      <td>沙垚; 许楠</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>274.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                         篇名            作者  \\\n",
       "0            1                     多元环境下学术期刊品牌建设的思考  网络首发           宋启凡   \n",
       "1            2                国家治理视角下县级融媒体中心传播功能再解读  网络首发            倪琳   \n",
       "2            3                      国家治理视角下县级融媒体中心传播功能再解读            倪琳   \n",
       "3            4         县级融媒体如何推动政治倾听与舆论引导——以河南省新冠肺炎疫情防控为例       杨逍; 王子丰   \n",
       "4            5              “使用与满足”的凸显——融媒体时代电视综艺节目发展的新思路       陈淼; 王云峰   \n",
       "5            6                     融媒体时代主题出版转型发展的探究  网络首发           区燕宜   \n",
       "6            7                          论融媒体素养对新文科人才培养的意义            白寅   \n",
       "7            8                       欠发达地区县级融媒体中心建设的难点与出路            李琴   \n",
       "8            9                       大河网2021年全国两会报道创新策略研究           王亚楠   \n",
       "9           10   融媒时代中国故事的叙事与传播创新策略——以人民日报《中国24小时》系列微视频为例            刘珊   \n",
       "10          11                     数据融通：县级融媒体中心的数据利用与功能发挥            沈晖   \n",
       "11          12                            南京广电的融媒体创新与升级策略      秘春茜; 李宏刚   \n",
       "12          13                  现象级融媒传播精品策划研究——以福州广播电视台为例           陈建斌   \n",
       "13          14              融媒体时代影视传播的形态及接受差异研究——评《影视传播学》           张建朔   \n",
       "14          15                         论中国共产党新闻思想百年发展历史进程       郑保卫; 王青   \n",
       "15          16                    2020主题出版：政策引导、业界实践与理论研究      周蔚华; 何小凡   \n",
       "16          17                   理念·文本·叙事：主旋律纪录片融合创新的三重向度            李晨   \n",
       "17          18             融合、话语与共同体精神：论战“疫”系列短视频的创作与传播路径           位俊达   \n",
       "18          19                          媒体融合背景下的非遗传播演变和挑战       杜晓晶; 徐佶   \n",
       "19          20                         党媒融媒体工作室的新闻生产与创新研究      左小麟; 郑伊健   \n",
       "20          21              总结“十三五” 展望“十四五” 大力推动广电产业高质量发展            张先   \n",
       "21          22        阐释艺术内涵 展望产业未来——评《融媒时代我国数字影视传播与产业研究》            倪莉   \n",
       "22          23                   融媒体时代文化传播创新路径——推荐《文化传播学》       田虹; 费艳颖   \n",
       "23          24         论报业融媒体经营中的社群私域流量建设——以《阳光少年报》社群运维为例           胡士才   \n",
       "24          25           融媒体时代下薪酬公平感对双元媒体人的影响——自我效能感的中介作用      吴珍妮; 娄世艳   \n",
       "25          26                       融媒体视阈下电视公共服务节目创新路径探究       马玥; 刘丽群   \n",
       "26          27     融媒体背景下儿童绘本传播研究——以上海外语教育出版社“儿童英语阅读魔盒”为例      黄新炎; 李茜榕   \n",
       "27          28  重大突发公共事件中基层媒体融合传播的创新与思考——以三家县级融媒体中心抗疫报道为例      陈月飞; 丁和根   \n",
       "28          29                            我国跨文化传播的困境与优化路径           段龙江   \n",
       "29          30                     智媒时代县级融媒建设发展的制约瓶颈与应对策略       刘峰; 罗敦洲   \n",
       "30          31                       大数据时代主流媒体传播体系的融媒体化策略  张才明; 褚婉宏; 金韶   \n",
       "31          32                          融媒体语境下综合类期刊创新发展思考           李秀娟   \n",
       "32          33                        少儿传统文化图书创作理念及出版形式探析            韩青   \n",
       "33          34                 社会治理视阈下县级融媒体中心建设：功能定位与实践逻辑      李文冰; 吴莎琪   \n",
       "34          35                          全媒体传播体系与四级融合新发展格局      胡正荣; 蒋东旭   \n",
       "35          36                         2021年全国两会融媒体报道创新观察           王晓东   \n",
       "36          37                         2021年全国两会总台融媒体报道特色    王莹; 覃垚; 戴冰   \n",
       "37          38                    春天之约 英雄答卷——湖北广电全国两会报道纪实           郭小容   \n",
       "38          39          融媒传播中“媒资”价值变现路径初探——以丽水广电集团融媒体中心为例  董枫; 施龙有; 吴峰平   \n",
       "39          40        融媒体背景下内容产品的多元开发——以《笔尖上的诗词课》融合出版项目为例           鲁艳芳   \n",
       "40          41                          浙江安吉新闻集团智慧化融合模式解析           乔秀峰   \n",
       "41          42           从市场竞合到纳入国家治理体系——中国媒介融合研究20年之语境变迁           栾轶玫   \n",
       "42          43                          媒体融合的现状、难点与市场机制突破           陈国权   \n",
       "43          44          主流媒体平台建设的优势与短板——从三大央媒的平台实践看深化媒体融合       蔡雯; 汪惠怡   \n",
       "44          45                                  建强不易 用好更难           唐绪军   \n",
       "45          46                           基层社会治理与县级融媒体中心建设            李蕾   \n",
       "46          47                        媒体介入基层社会治理的现状、角色与维度           丁和根   \n",
       "47          48                    参与式治理视角下县级融媒体的角色定位与发展路径       罗昕; 蔡雨婷   \n",
       "48          49              重建本地用户连接 融入基层社会治理：县级融媒体发展路径研究       曾润喜; 杨璨   \n",
       "49          50                         融合人民：县级媒体融合与基层协同治理        沙垚; 许楠   \n",
       "\n",
       "                刊名              发表时间  被引     下载  操作  \n",
       "0            科技与出版  2021-06-29 17:21 NaN    4.0  下载  \n",
       "1             东岳论丛  2021-06-23 14:01 NaN   76.0  下载  \n",
       "2             东岳论丛  2021-06-23 14:01 NaN    NaN  下载  \n",
       "3            新闻爱好者        2021-06-20 NaN   22.0  下载  \n",
       "4             中国电视        2021-06-15 NaN  296.0  下载  \n",
       "5            科技与出版  2021-06-11 13:33 NaN   97.0  下载  \n",
       "6             中国编辑        2021-06-10 NaN   77.0  下载  \n",
       "7               传媒        2021-06-10 NaN   28.0  下载  \n",
       "8               传媒        2021-06-10 NaN   15.0  下载  \n",
       "9               传媒        2021-06-10 NaN   30.0  下载  \n",
       "10              传媒        2021-06-10 NaN   19.0  下载  \n",
       "11              传媒        2021-06-10 NaN   23.0  下载  \n",
       "12              传媒        2021-06-10 NaN   13.0  下载  \n",
       "13           新闻与写作        2021-06-05 NaN   84.0  下载  \n",
       "14          社会科学战线        2021-06-01 NaN  280.0  下载  \n",
       "15            中国出版        2021-06-01 NaN   47.0  下载  \n",
       "16            当代电视        2021-06-01 NaN   79.0  下载  \n",
       "17            当代电视        2021-06-01 NaN   55.0  下载  \n",
       "18            当代电视        2021-06-01 NaN   58.0  下载  \n",
       "19            出版广角        2021-05-30 NaN    NaN  下载  \n",
       "20              传媒        2021-05-25 NaN  106.0  下载  \n",
       "21              传媒        2021-05-25 NaN   75.0  下载  \n",
       "22            新闻记者        2021-05-20 NaN  184.0  下载  \n",
       "23           新闻爱好者        2021-05-20 NaN   62.0  下载  \n",
       "24          重庆社会科学        2021-05-16 NaN   71.0  下载  \n",
       "25            中国电视        2021-05-15 NaN  119.0  下载  \n",
       "26            出版广角        2021-05-15 NaN   84.0  下载  \n",
       "27            当代传播        2021-05-15 NaN  243.0  下载  \n",
       "28            人民论坛        2021-05-15 NaN  370.0  下载  \n",
       "29          出版发行研究        2021-05-15 NaN   62.0  下载  \n",
       "30            编辑学刊        2021-05-15 NaN   28.0  下载  \n",
       "31            编辑学刊        2021-05-15 NaN   29.0  下载  \n",
       "32            编辑学刊        2021-05-15 NaN   21.0  下载  \n",
       "33  现代传播(中国传媒大学学报)        2021-05-15 NaN  160.0  下载  \n",
       "34            中国编辑        2021-05-10 NaN  241.0  下载  \n",
       "35              传媒        2021-05-10 NaN  103.0  下载  \n",
       "36              传媒        2021-05-10 NaN   52.0  下载  \n",
       "37              传媒        2021-05-10 NaN   10.0  下载  \n",
       "38              传媒        2021-05-10 NaN   37.0  下载  \n",
       "39              传媒        2021-05-10 NaN   32.0  下载  \n",
       "40              传媒        2021-05-10 NaN   42.0  下载  \n",
       "41            编辑之友        2021-05-05 NaN  418.0  下载  \n",
       "42            编辑之友        2021-05-05 NaN  483.0  下载  \n",
       "43            编辑之友        2021-05-05 NaN  519.0  下载  \n",
       "44           新闻与写作        2021-05-05 NaN   61.0  下载  \n",
       "45           新闻与写作        2021-05-05 NaN  306.0  下载  \n",
       "46           新闻与写作        2021-05-05 NaN  373.0  下载  \n",
       "47           新闻与写作        2021-05-05 NaN  285.0  下载  \n",
       "48           新闻与写作        2021-05-05 NaN  343.0  下载  \n",
       "49           新闻与写作        2021-05-05 NaN  274.0  下载  "
      ]
     },
     "execution_count": 748,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the current result page: parse the HTML inside #gridTable into a DataFrame.\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "firstpage_html = element.get_attribute('innerHTML')\n",
    "pd.read_html(firstpage_html)[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导出refworks文件（.txt）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 620,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Focus the window at index 1 of driver.window_handles before starting the export steps.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 566,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a',\n",
       " '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[1]/label/input',\n",
       " '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/i',\n",
       " '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/a',\n",
       " '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/ul/li[8]/a']"
      ]
     },
     "execution_count": 566,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# XPaths of the elements to click, in this order, on every pass of the export loop.\n",
    "piliang_list = [\n",
    "    '//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a',\n",
    "    \"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[1]/label/input\",\n",
    "    \"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/i\",\n",
    "    \"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/a\",\n",
    "    \"/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/ul/li[8]/a\",\n",
    "]\n",
    "piliang_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 567,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build Chrome options whose prefs set the download directory and the popup content setting.\n",
    "# NOTE(review): this cell only creates `opts`; it does not pass it to a webdriver, so confirm\n",
    "# whether the already running `driver` actually uses these preferences.\n",
    "opts = webdriver.ChromeOptions()\n",
    "prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': 'C:/Users/喜东东/Desktop/web/验证码/ref文件'}\n",
    "opts.add_experimental_option('prefs', prefs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 571,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export each result page as a RefWorks .txt file.\n",
    "# range(18) runs 18 iterations; the original note mentions 17 pages and >800 records in total.\n",
    "for page in range(18):\n",
    "    # Click the prepared XPaths one after another, pausing 2 s after every click.\n",
    "    for xpath in piliang_list:\n",
    "        driver.find_element_by_xpath(xpath).click()\n",
    "        time.sleep(2)\n",
    "    driver.switch_to.window(driver.window_handles[2])  # move to the newly opened window\n",
    "    time.sleep(8)\n",
    "    driver.find_element_by_xpath('//*[@id=\"litotxt\"]/a').click()  # click export\n",
    "    time.sleep(8)\n",
    "    driver.close()  # close the new window\n",
    "    time.sleep(4)\n",
    "    driver.switch_to.window(driver.window_handles[1])  # back to the original window\n",
    "    driver.find_element_by_xpath('//*[@id=\"PageNext\"]').click()  # go to the next page\n",
    "    time.sleep(9)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 下载pdf文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Go back to the first page by clicking the element with id \"total\".\n",
    "driver.find_element_by_xpath('//*[@id=\"total\"]').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 封装"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 749,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.switch_to.window(driver.window_handles[1])\n",
    "\n",
    "\n",
    "def img_have():\n",
    "    \"\"\"Report whether the captcha image (element id ``vImg``) exists in the current window.\n",
    "\n",
    "    Returns:\n",
    "        True when the element is found, False when Selenium reports it as missing.\n",
    "\n",
    "    Raises:\n",
    "        Any other WebDriver error is propagated instead of being reported as \"no captcha\".\n",
    "    \"\"\"\n",
    "    # Imported locally so this cell does not depend on the import cell being edited.\n",
    "    from selenium.common.exceptions import NoSuchElementException\n",
    "\n",
    "    try:\n",
    "        driver.find_element_by_xpath('//*[@id=\"vImg\"]')\n",
    "    except NoSuchElementException:\n",
    "        # Only a missing element means \"no captcha\"; a bare except would also swallow Ctrl-C.\n",
    "        return False\n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 750,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Captcha screenshot helper\n",
    "def img():\n",
    "    \"\"\"Click the captcha image, screenshot the page, and return the cropped, enhanced captcha.\n",
    "\n",
    "    The processed crop is saved over the full-page screenshot file and re-opened from disk.\n",
    "    \"\"\"\n",
    "    captcha_xpath = '//*[@id=\"vImg\"]'\n",
    "    shot_path = r\"C:\\Users\\喜东东\\Desktop\\web\\screenImg.png\"  # local save location\n",
    "\n",
    "    driver.find_element_by_xpath(captcha_xpath).click()\n",
    "    time.sleep(2)\n",
    "    driver.get_screenshot_as_file(shot_path)\n",
    "\n",
    "    origin = driver.find_element_by_xpath(captcha_xpath).location\n",
    "    extent = driver.find_element_by_xpath(captcha_xpath).size\n",
    "    # Hand-tuned pixel offsets that move the crop box relative to the element's reported box\n",
    "    # (the original note says they were adjusted to improve recognition accuracy).\n",
    "    crop_box = (\n",
    "        origin['x'] + 100,\n",
    "        origin['y'] + 70,\n",
    "        origin['x'] + extent['width'] + 250,\n",
    "        origin['y'] + extent['height'] + 80,\n",
    "    )\n",
    "\n",
    "    # Greyscale the crop, then double its contrast.\n",
    "    captcha = Image.open(shot_path).crop(crop_box).convert('RGBA').convert('L')\n",
    "    captcha = ImageEnhance.Contrast(captcha).enhance(2.0)\n",
    "    captcha.save(shot_path)\n",
    "    return Image.open(shot_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 751,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrapper around the ttshitu captcha-recognition API.\n",
    "# NOTE(review): the account credentials were hardcoded in this cell. They can now be supplied via\n",
    "# TTSHITU_USERNAME / TTSHITU_PASSWORD; the literals remain only as fallbacks so the notebook keeps\n",
    "# running. Rotate the password and delete the fallbacks before sharing this notebook.\n",
    "uname = os.environ.get(\"TTSHITU_USERNAME\", \"choogohah\")\n",
    "pwd = os.environ.get(\"TTSHITU_PASSWORD\", \"783639524\")\n",
    "typeid = \"3\"\n",
    "\n",
    "\n",
    "def base64_api(uname, pwd, img, typeid):\n",
    "    \"\"\"Send the saved captcha screenshot to the predict endpoint and return the recognised text.\n",
    "\n",
    "    Args:\n",
    "        uname: account user name sent as \"username\".\n",
    "        pwd: account password sent as \"password\".\n",
    "        img: ignored; kept so existing calls still work. The image is always read from the\n",
    "            screenshot file on disk.\n",
    "        typeid: value sent as \"typeid\".\n",
    "\n",
    "    Returns:\n",
    "        result[\"data\"][\"result\"] when the response reports success, otherwise result[\"message\"].\n",
    "\n",
    "    Raises:\n",
    "        requests.exceptions.RequestException: on connection failure or when the 30 s timeout expires.\n",
    "    \"\"\"\n",
    "    with open('C:/Users/喜东东/Desktop/web/screenImg.png', 'rb') as f:\n",
    "        encoded = base64.b64encode(f.read()).decode()\n",
    "    data = {\"username\": uname, \"password\": pwd, \"typeid\": typeid, \"image\": encoded}\n",
    "    time.sleep(1)\n",
    "    # NOTE(review): the endpoint is plain http, so the password travels unencrypted.\n",
    "    # A timeout keeps a stalled request from hanging the download loop forever.\n",
    "    response = requests.post(\"http://api.ttshitu.com/predict\", json=data, timeout=30)\n",
    "    result = json.loads(response.text)\n",
    "    if result['success']:\n",
    "        return result[\"data\"][\"result\"]\n",
    "    return result[\"message\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 752,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Captcha input helper\n",
    "def shuruma():\n",
    "    \"\"\"Recognise the saved captcha via base64_api and type the result into the ``vcode`` input.\"\"\"\n",
    "    # `img` is passed only to satisfy base64_api's signature; base64_api reads the image from disk.\n",
    "    yanzhengma = base64_api(uname, pwd, img, typeid)\n",
    "    code_field = driver.find_element_by_xpath('//*[@id=\"vcode\"]')\n",
    "    code_field.clear()  # remove any previous attempt before typing\n",
    "    code_field.send_keys(yanzhengma)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 753,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Window switching\n",
    "def qiehuan():\n",
    "    \"\"\"Switch the driver to the last handle in ``driver.window_handles``.\"\"\"\n",
    "    driver.switch_to.window(driver.window_handles[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 754,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean-up after a successful download\n",
    "def true():\n",
    "    \"\"\"Close the last window twice, ending on the window at index 1.\n",
    "\n",
    "    Each pass switches to the last window handle, closes it, switches to the given\n",
    "    index (2 on the first pass, 1 on the second) and sleeps for one second.\n",
    "    \"\"\"\n",
    "    for return_index in (2, 1):\n",
    "        qiehuan()\n",
    "        driver.close()\n",
    "        driver.switch_to.window(driver.window_handles[return_index])\n",
    "        time.sleep(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 755,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Downloaded-file counter (called before the captcha is submitted)\n",
    "def wenjian1():\n",
    "    \"\"\"Return how many regular files sit directly inside the PDF download folder.\"\"\"\n",
    "    # The browser's download location has to be set to this folder.\n",
    "    path = r'C:\\Users\\喜东东\\Desktop\\web\\验证码\\pdf文件'\n",
    "    return sum(1 for entry in os.listdir(path) if os.path.isfile(os.path.join(path, entry)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 756,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Downloaded-file counter (called after the captcha is submitted)\n",
    "def wenjian2():\n",
    "    \"\"\"Return the number of files in the PDF download folder.\n",
    "\n",
    "    This used to be a line-for-line copy of wenjian1; it now delegates to it so the\n",
    "    folder path and counting rule live in one place.\n",
    "    \"\"\"\n",
    "    return wenjian1()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 757,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Did a download happen?\n",
    "def wenjian3():\n",
    "    \"\"\"Return True when the global ``file2`` count exceeds the global ``file`` count.\n",
    "\n",
    "    ``file`` is the count taken before the captcha was submitted and ``file2`` the count taken\n",
    "    afterwards, so a larger ``file2`` means a new file arrived; otherwise False is returned.\n",
    "    \"\"\"\n",
    "    return file2 > file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 758,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Return to the list page; handy when re-running after an error.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 循环遍历下载pdf档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the PDF of every listed article, solving the captcha dialog whenever one appears.\n",
    "for page in range(1, 4):  # number of result pages to crawl\n",
    "    # Rows to fetch on each page. NOTE(review): range(1, 50) stops at row 49; use range(1, 51)\n",
    "    # if the 50th row of a page should be downloaded too.\n",
    "    for link in range(1, 50):\n",
    "        pdf_xpath = '//*[@id=\"gridTable\"]/table/tbody/tr[{}]/td[2]/a'.format(link)\n",
    "        driver.find_element_by_xpath(pdf_xpath).click()  # open the article detail page\n",
    "        time.sleep(1)\n",
    "        driver.switch_to.window(driver.window_handles[2])  # the newly opened window\n",
    "\n",
    "        driver.find_element_by_xpath('//*[@id=\"pdfDown\"]').click()  # click download\n",
    "        time.sleep(3)\n",
    "        qiehuan()  # focus the newest window\n",
    "        time.sleep(4)\n",
    "\n",
    "        if img_have():  # a captcha window popped up\n",
    "            # Retry until the download folder gains a file.\n",
    "            # NOTE(review): there is no attempt limit; every pass calls the recognition API once.\n",
    "            while True:\n",
    "                img()  # capture the captcha image\n",
    "                time.sleep(2)\n",
    "                shuruma()  # recognise the captcha (single API call) and type it in\n",
    "                file = wenjian1()  # file count before submitting; read by wenjian3\n",
    "                time.sleep(1)\n",
    "                driver.find_element_by_xpath('/html/body/div/form/dl/dd/button').click()  # submit\n",
    "                time.sleep(1)  # wait before counting so the new file has a chance to appear\n",
    "                file2 = wenjian2()  # file count after submitting; read by wenjian3\n",
    "                if wenjian3():  # count increased: captcha accepted, download started\n",
    "                    break\n",
    "            true()  # close the extra windows and return to the list window\n",
    "        else:\n",
    "            # No captcha: the file downloads directly, so close the newest window and go back.\n",
    "            qiehuan()\n",
    "            driver.close()\n",
    "            driver.switch_to.window(driver.window_handles[1])\n",
    "            time.sleep(1)\n",
    "\n",
    "    # Advance to the next result page (same control and wait as the RefWorks export loop).\n",
    "    driver.find_element_by_xpath('//*[@id=\"PageNext\"]').click()\n",
    "    time.sleep(9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "341.333px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
